{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Syllable tokenizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This tutorial is available as an IPython notebook at [Malaya/example/tokenizer-syllable](https://github.com/huseinzol05/Malaya/tree/master/example/tokenizer-syllable).\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-warning\">\n",
    "\n",
    "This module is only suitable for standard language structure, so it is not safe to use it for local language structure.\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
      "CPU times: user 3.2 s, sys: 2.88 s, total: 6.08 s\n",
      "Wall time: 2.56 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/ssd3/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load rules based syllable tokenizer\n",
    "\n",
    "```python\n",
    "def rules(**kwargs):\n",
    "    \"\"\"\n",
    "    Load rules based syllable tokenizer.\n",
    "    originally from https://github.com/fahadh4ilyas/syllable_splitter/blob/master/SyllableSplitter.py\n",
    "    - improved `cuaca` double vocal `ua` based on https://en.wikipedia.org/wiki/Comparison_of_Indonesian_and_Standard_Malay#Syllabification\n",
    "    - improved `rans` double consonant `ns` based on https://www.semanticscholar.org/paper/Syllabification-algorithm-based-on-syllable-rules-Musa-Kadir/a819f255f066ae0fd7a30b3534de41da37d04ea1\n",
    "    - improved `au` and `ai` double vocal.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: malaya.syllable.Tokenizer class\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = malaya.syllable.rules()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tokenize\n",
    "\n",
    "```python\n",
    "def tokenize(self, string):\n",
    "    \"\"\"\n",
    "    Tokenize string into multiple strings using syllable patterns.\n",
    "    Example from https://www.semanticscholar.org/paper/Syllabification-algorithm-based-on-syllable-rules-Musa-Kadir/a819f255f066ae0fd7a30b3534de41da37d04ea1/figure/0,\n",
    "    'cuaca' -> ['cua', 'ca']\n",
    "    'insurans' -> ['in', 'su', 'rans']\n",
    "    'praktikal' -> ['prak', 'ti', 'kal']\n",
    "    'strategi' -> ['stra', 'te', 'gi']\n",
    "    'ayam' -> ['a', 'yam']\n",
    "    'anda' -> ['an', 'da']\n",
    "    'hantu' -> ['han', 'tu']\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[str]\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/ssd3/malaya/malaya/model/syllable.py:46: FutureWarning: Possible nested set at position 3\n",
      "  or re.findall(_expressions['ic'], word.lower())\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['a', 'ngan', '-', 'a', 'ngan']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('angan-angan')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['cua', 'ca']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('cuaca')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['hi', 'dup']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('hidup')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['in', 'su', 'ran']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('insuran')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['in', 'su', 'rans']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('insurans')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'yam']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('ayam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['stra', 'te', 'gi']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('strategi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['han', 'tu']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('hantu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['hel', 'lo']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize('hello')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Better performance\n",
    "\n",
    "Split the string into words and tokenize each word separately."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "string = 'sememang-memangnya kau sakai siot'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['se', 'me', 'mang', '-', 'me', 'mang', 'nya', 'kau', 'sa', 'kai', 'siot']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results = []\n",
    "for w in string.split():\n",
    "    results.extend(tokenizer.tokenize(w))\n",
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List available HuggingFace models\n",
    "\n",
    "We also provide a syllable tokenizer using deep learning, trained on the DBP dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mesolitica/syllable-lstm': {'Size (MB)': 35.2,\n",
       "  'hidden size': 512,\n",
       "  'CER': 0.011996584781229728,\n",
       "  'WER': 0.06915983606557377}}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.syllable.available_huggingface"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load deep learning model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "model = malaya.syllable.huggingface()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Tokenize\n",
    "\n",
    "```python\n",
    "def tokenize(self, string, beam_search: bool = False):\n",
    "    \"\"\"\n",
    "    Tokenize string into multiple strings using deep learning.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "    beam_search : bool, (optional=False)\n",
    "        If True, use beam search decoder, else use greedy decoder.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[str]\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'ngan', 'a', 'ngan']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.tokenize('angan-angan')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['in', 'su', 'ran']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.tokenize('insuran')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['in', 'sur', 'ans']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.tokenize('insurans')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Harder example\n",
    "\n",
    "Test set from DBP at https://huggingface.co/datasets/mesolitica/syllable/raw/main/test-syllable.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "\n",
    "r = requests.get('https://huggingface.co/datasets/mesolitica/syllable/raw/main/test-syllable.json')\n",
    "test_set = r.json()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1952"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_wer(actual, hyp):\n",
    "    \"\"\"\n",
    "    Calculate the word error rate (WER) of `hyp` against `actual` using `python-Levenshtein`.\n",
    "\n",
    "    Both strings are split on whitespace, so two non-empty strings that contain\n",
    "    no whitespace are each treated as a single word and score either 0.0\n",
    "    (identical) or 1.0 (different).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    actual : str\n",
    "        Reference text; must contain at least one word.\n",
    "    hyp : str\n",
    "        Hypothesis text to score against the reference.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: float\n",
    "        Word-level edit distance divided by the number of reference words.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `actual` contains no words, because the rate would be undefined.\n",
    "    \"\"\"\n",
    "    import Levenshtein as Lev\n",
    "\n",
    "    actual_words = actual.split()\n",
    "    hyp_words = hyp.split()\n",
    "    if not actual_words:\n",
    "        raise ValueError('`actual` must contain at least one word to compute WER')\n",
    "\n",
    "    # Encode every distinct word as one unique character, so the character-level\n",
    "    # Levenshtein distance of the encoded strings equals the word-level distance.\n",
    "    vocab = set(actual_words + hyp_words)\n",
    "    word2char = {word: chr(index) for index, word in enumerate(vocab)}\n",
    "\n",
    "    encoded_actual = ''.join(word2char[w] for w in actual_words)\n",
    "    encoded_hyp = ''.join(word2char[w] for w in hyp_words)\n",
    "\n",
    "    return Lev.distance(encoded_actual, encoded_hyp) / len(actual_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.09016393442622951"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wers = []\n",
    "for test in test_set:\n",
    "    t = tokenizer.tokenize(test[0])\n",
    "    t = [t_ for t_ in t if t_ not in ['-']]\n",
    "    wer = calculate_wer(test[1], '.'.join(t))\n",
    "    wers.append(wer)\n",
    "    \n",
    "sum(wers) / len(wers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original: mengilukan\n",
      "actual: ['me', 'ngi', 'lu', 'kan']\n",
      "predicted: ['me', 'ngi', 'lu', 'kan']\n",
      "\n",
      "original: menjongkok\n",
      "actual: ['men', 'jong', 'kok']\n",
      "predicted: ['men', 'jong', 'kok']\n",
      "\n",
      "original: tergabas\n",
      "actual: ['ter', 'ga', 'bas']\n",
      "predicted: ['ter', 'ga', 'bas']\n",
      "\n",
      "original: perunding\n",
      "actual: ['pe', 'run', 'ding']\n",
      "predicted: ['pe', 'run', 'ding']\n",
      "\n",
      "original: kemahalan\n",
      "actual: ['ke', 'ma', 'ha', 'lan']\n",
      "predicted: ['ke', 'ma', 'ha', 'lan']\n",
      "\n",
      "original: renggang\n",
      "actual: ['reng', 'gang']\n",
      "predicted: ['reng', 'gang']\n",
      "\n",
      "original: bersuci\n",
      "actual: ['ber', 'su', 'ci']\n",
      "predicted: ['ber', 'su', 'ci']\n",
      "\n",
      "original: jelebat\n",
      "actual: ['je', 'le', 'bat']\n",
      "predicted: ['je', 'le', 'bat']\n",
      "\n",
      "original: rekod\n",
      "actual: ['re', 'kod']\n",
      "predicted: ['re', 'kod']\n",
      "\n",
      "original: amang\n",
      "actual: ['a', 'mang']\n",
      "predicted: ['a', 'mang']\n",
      "\n",
      "original: aromaterapi\n",
      "actual: ['a', 'ro', 'ma', 'te', 'ra', 'pi']\n",
      "predicted: ['a', 'ro', 'ma', 'te', 'ra', 'pi']\n",
      "\n",
      "original: pengkompaunan\n",
      "actual: ['peng', 'kom', 'pau', 'nan']\n",
      "predicted: ['peng', 'kom', 'pau', 'nan']\n",
      "\n",
      "original: payah\n",
      "actual: ['pa', 'yah']\n",
      "predicted: ['pa', 'yah']\n",
      "\n",
      "original: menghargai\n",
      "actual: ['meng', 'har', 'ga', 'i']\n",
      "predicted: ['meng', 'har', 'gai']\n",
      "\n",
      "original: keterpaksaan\n",
      "actual: ['ke', 'ter', 'pak', 'sa', 'an']\n",
      "predicted: ['ke', 'ter', 'pak', 'sa', 'an']\n",
      "\n",
      "original: kerempagi\n",
      "actual: ['ke', 'rem', 'pa', 'gi']\n",
      "predicted: ['ke', 'rem', 'pa', 'gi']\n",
      "\n",
      "original: pengancaman\n",
      "actual: ['pe', 'ngan', 'ca', 'man']\n",
      "predicted: ['pe', 'ngan', 'ca', 'man']\n",
      "\n",
      "original: kedwilogaman\n",
      "actual: ['ke', 'dwi', 'lo', 'ga', 'man']\n",
      "predicted: ['ked', 'wi', 'lo', 'ga', 'man']\n",
      "\n",
      "original: copeng\n",
      "actual: ['co', 'peng']\n",
      "predicted: ['co', 'peng']\n",
      "\n",
      "original: antienzim\n",
      "actual: ['an', 'ti', 'en', 'zim']\n",
      "predicted: ['an', 'ti', 'en', 'zim']\n",
      "\n",
      "original: angkar\n",
      "actual: ['ang', 'kar']\n",
      "predicted: ['ang', 'kar']\n",
      "\n",
      "original: menjembak\n",
      "actual: ['men', 'jem', 'bak']\n",
      "predicted: ['men', 'jem', 'bak']\n",
      "\n",
      "original: tanggah\n",
      "actual: ['tang', 'gah']\n",
      "predicted: ['tang', 'gah']\n",
      "\n",
      "original: berjujuk\n",
      "actual: ['ber', 'ju', 'juk']\n",
      "predicted: ['ber', 'ju', 'juk']\n",
      "\n",
      "original: nestapa\n",
      "actual: ['nes', 'ta', 'pa']\n",
      "predicted: ['nes', 'ta', 'pa']\n",
      "\n",
      "original: engku\n",
      "actual: ['eng', 'ku']\n",
      "predicted: ['eng', 'ku']\n",
      "\n",
      "original: undang-undang\n",
      "actual: ['un', 'dang', 'un', 'dang']\n",
      "predicted: ['un', 'dang', '-', 'un', 'dang']\n",
      "\n",
      "original: tiket\n",
      "actual: ['ti', 'ket']\n",
      "predicted: ['ti', 'ket']\n",
      "\n",
      "original: janin\n",
      "actual: ['ja', 'nin']\n",
      "predicted: ['ja', 'nin']\n",
      "\n",
      "original: pakuk\n",
      "actual: ['pa', 'kuk']\n",
      "predicted: ['pa', 'kuk']\n",
      "\n",
      "original: betika\n",
      "actual: ['be', 'ti', 'ka']\n",
      "predicted: ['be', 'ti', 'ka']\n",
      "\n",
      "original: nangoi\n",
      "actual: ['na', 'ngoi']\n",
      "predicted: ['na', 'ngo', 'i']\n",
      "\n",
      "original: mulato\n",
      "actual: ['mu', 'la', 'to']\n",
      "predicted: ['mu', 'la', 'to']\n",
      "\n",
      "original: peruasan\n",
      "actual: ['pe', 'rua', 'san']\n",
      "predicted: ['pe', 'rua', 'san']\n",
      "\n",
      "original: terkajang\n",
      "actual: ['ter', 'ka', 'jang']\n",
      "predicted: ['ter', 'ka', 'jang']\n",
      "\n",
      "original: menjanda\n",
      "actual: ['men', 'jan', 'da']\n",
      "predicted: ['men', 'jan', 'da']\n",
      "\n",
      "original: menautkan\n",
      "actual: ['me', 'naut', 'kan']\n",
      "predicted: ['me', 'naut', 'kan']\n",
      "\n",
      "original: khalayak\n",
      "actual: ['kha', 'la', 'yak']\n",
      "predicted: ['kha', 'la', 'yak']\n",
      "\n",
      "original: lena\n",
      "actual: ['le', 'na']\n",
      "predicted: ['le', 'na']\n",
      "\n",
      "original: kesurupan\n",
      "actual: ['ke', 'su', 'ru', 'pan']\n",
      "predicted: ['ke', 'su', 'ru', 'pan']\n",
      "\n",
      "original: meneriak-neriakkan\n",
      "actual: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
      "predicted: ['me', 'ne', 'ri', 'ak', '-', 'ne', 'ri', 'ak', 'kan']\n",
      "\n",
      "original: bergumpal\n",
      "actual: ['ber', 'gum', 'pal']\n",
      "predicted: ['ber', 'gum', 'pal']\n",
      "\n",
      "original: rodat\n",
      "actual: ['ro', 'dat']\n",
      "predicted: ['ro', 'dat']\n",
      "\n",
      "original: sepukal\n",
      "actual: ['se', 'pu', 'kal']\n",
      "predicted: ['se', 'pu', 'kal']\n",
      "\n",
      "original: kerani\n",
      "actual: ['ke', 'ra', 'ni']\n",
      "predicted: ['ke', 'ra', 'ni']\n",
      "\n",
      "original: mewahyukan\n",
      "actual: ['me', 'wah', 'yu', 'kan']\n",
      "predicted: ['me', 'wah', 'yu', 'kan']\n",
      "\n",
      "original: berprestij\n",
      "actual: ['ber', 'pres', 'tij']\n",
      "predicted: ['ber', 'pres', 'tij']\n",
      "\n",
      "original: dingin\n",
      "actual: ['di', 'ngin']\n",
      "predicted: ['di', 'ngin']\n",
      "\n",
      "original: lipas\n",
      "actual: ['li', 'pas']\n",
      "predicted: ['li', 'pas']\n",
      "\n",
      "original: berdingkit-dingkit\n",
      "actual: ['ber', 'ding', 'kit', 'ding', 'kit']\n",
      "predicted: ['ber', 'ding', 'kit', '-', 'ding', 'kit']\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for test in test_set[:50]:\n",
    "    print('original:', test[0])\n",
    "    print('actual:', test[1].split('.'))\n",
    "    t = tokenizer.tokenize(test[0])\n",
    "    print('predicted:', t)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0630122950819672"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wers = []\n",
    "for test in test_set:\n",
    "    t = model.tokenize(test[0])\n",
    "    t = [t_ for t_ in t if t_ not in ['-']]\n",
    "    wer = calculate_wer(test[1], '.'.join(t))\n",
    "    wers.append(wer)\n",
    "    \n",
    "sum(wers) / len(wers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "original: mengilukan\n",
      "actual: ['me', 'ngi', 'lu', 'kan']\n",
      "predicted: ['me', 'ngi', 'lu', 'kan']\n",
      "\n",
      "original: menjongkok\n",
      "actual: ['men', 'jong', 'kok']\n",
      "predicted: ['men', 'jong', 'kok']\n",
      "\n",
      "original: tergabas\n",
      "actual: ['ter', 'ga', 'bas']\n",
      "predicted: ['ter', 'ga', 'bas']\n",
      "\n",
      "original: perunding\n",
      "actual: ['pe', 'run', 'ding']\n",
      "predicted: ['pe', 'run', 'ding']\n",
      "\n",
      "original: kemahalan\n",
      "actual: ['ke', 'ma', 'ha', 'lan']\n",
      "predicted: ['ke', 'ma', 'ha', 'lan']\n",
      "\n",
      "original: renggang\n",
      "actual: ['reng', 'gang']\n",
      "predicted: ['reng', 'gang']\n",
      "\n",
      "original: bersuci\n",
      "actual: ['ber', 'su', 'ci']\n",
      "predicted: ['ber', 'su', 'ci']\n",
      "\n",
      "original: jelebat\n",
      "actual: ['je', 'le', 'bat']\n",
      "predicted: ['je', 'le', 'bat']\n",
      "\n",
      "original: rekod\n",
      "actual: ['re', 'kod']\n",
      "predicted: ['re', 'kod']\n",
      "\n",
      "original: amang\n",
      "actual: ['a', 'mang']\n",
      "predicted: ['a', 'mang']\n",
      "\n",
      "original: aromaterapi\n",
      "actual: ['a', 'ro', 'ma', 'te', 'ra', 'pi']\n",
      "predicted: ['a', 'ro', 'ma', 'te', 'ra', 'pi']\n",
      "\n",
      "original: pengkompaunan\n",
      "actual: ['peng', 'kom', 'pau', 'nan']\n",
      "predicted: ['peng', 'kom', 'pau', 'nan']\n",
      "\n",
      "original: payah\n",
      "actual: ['pa', 'yah']\n",
      "predicted: ['pa', 'yah']\n",
      "\n",
      "original: menghargai\n",
      "actual: ['meng', 'har', 'ga', 'i']\n",
      "predicted: ['meng', 'har', 'ga', 'i']\n",
      "\n",
      "original: keterpaksaan\n",
      "actual: ['ke', 'ter', 'pak', 'sa', 'an']\n",
      "predicted: ['ke', 'ter', 'pak', 'sa', 'an']\n",
      "\n",
      "original: kerempagi\n",
      "actual: ['ke', 'rem', 'pa', 'gi']\n",
      "predicted: ['ke', 'rem', 'pa', 'gi']\n",
      "\n",
      "original: pengancaman\n",
      "actual: ['pe', 'ngan', 'ca', 'man']\n",
      "predicted: ['pe', 'ngan', 'ca', 'man']\n",
      "\n",
      "original: kedwilogaman\n",
      "actual: ['ke', 'dwi', 'lo', 'ga', 'man']\n",
      "predicted: ['ke', 'dwi', 'lo', 'ga', 'man']\n",
      "\n",
      "original: copeng\n",
      "actual: ['co', 'peng']\n",
      "predicted: ['co', 'peng']\n",
      "\n",
      "original: antienzim\n",
      "actual: ['an', 'ti', 'en', 'zim']\n",
      "predicted: ['an', 'tien', 'zim']\n",
      "\n",
      "original: angkar\n",
      "actual: ['ang', 'kar']\n",
      "predicted: ['ang', 'kar']\n",
      "\n",
      "original: menjembak\n",
      "actual: ['men', 'jem', 'bak']\n",
      "predicted: ['men', 'jem', 'bak']\n",
      "\n",
      "original: tanggah\n",
      "actual: ['tang', 'gah']\n",
      "predicted: ['tang', 'gah']\n",
      "\n",
      "original: berjujuk\n",
      "actual: ['ber', 'ju', 'juk']\n",
      "predicted: ['ber', 'ju', 'juk']\n",
      "\n",
      "original: nestapa\n",
      "actual: ['nes', 'ta', 'pa']\n",
      "predicted: ['nes', 'ta', 'pa']\n",
      "\n",
      "original: engku\n",
      "actual: ['eng', 'ku']\n",
      "predicted: ['eng', 'ku']\n",
      "\n",
      "original: undang-undang\n",
      "actual: ['un', 'dang', 'un', 'dang']\n",
      "predicted: ['un', 'dang', 'un', 'dang']\n",
      "\n",
      "original: tiket\n",
      "actual: ['ti', 'ket']\n",
      "predicted: ['ti', 'ket']\n",
      "\n",
      "original: janin\n",
      "actual: ['ja', 'nin']\n",
      "predicted: ['ja', 'nin']\n",
      "\n",
      "original: pakuk\n",
      "actual: ['pa', 'kuk']\n",
      "predicted: ['pa', 'kuk']\n",
      "\n",
      "original: betika\n",
      "actual: ['be', 'ti', 'ka']\n",
      "predicted: ['be', 'ti', 'ka']\n",
      "\n",
      "original: nangoi\n",
      "actual: ['na', 'ngoi']\n",
      "predicted: ['na', 'ngo', 'i']\n",
      "\n",
      "original: mulato\n",
      "actual: ['mu', 'la', 'to']\n",
      "predicted: ['mu', 'la', 'to']\n",
      "\n",
      "original: peruasan\n",
      "actual: ['pe', 'rua', 'san']\n",
      "predicted: ['pe', 'rua', 'san']\n",
      "\n",
      "original: terkajang\n",
      "actual: ['ter', 'ka', 'jang']\n",
      "predicted: ['ter', 'ka', 'jang']\n",
      "\n",
      "original: menjanda\n",
      "actual: ['men', 'jan', 'da']\n",
      "predicted: ['men', 'jan', 'da']\n",
      "\n",
      "original: menautkan\n",
      "actual: ['me', 'naut', 'kan']\n",
      "predicted: ['me', 'naut', 'kan']\n",
      "\n",
      "original: khalayak\n",
      "actual: ['kha', 'la', 'yak']\n",
      "predicted: ['kha', 'la', 'yak']\n",
      "\n",
      "original: lena\n",
      "actual: ['le', 'na']\n",
      "predicted: ['le', 'na']\n",
      "\n",
      "original: kesurupan\n",
      "actual: ['ke', 'su', 'ru', 'pan']\n",
      "predicted: ['ke', 'su', 'ru', 'pan']\n",
      "\n",
      "original: meneriak-neriakkan\n",
      "actual: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
      "predicted: ['me', 'ne', 'riak', 'ne', 'riak', 'kan']\n",
      "\n",
      "original: bergumpal\n",
      "actual: ['ber', 'gum', 'pal']\n",
      "predicted: ['ber', 'gum', 'pal']\n",
      "\n",
      "original: rodat\n",
      "actual: ['ro', 'dat']\n",
      "predicted: ['ro', 'dat']\n",
      "\n",
      "original: sepukal\n",
      "actual: ['se', 'pu', 'kal']\n",
      "predicted: ['se', 'pu', 'kal']\n",
      "\n",
      "original: kerani\n",
      "actual: ['ke', 'ra', 'ni']\n",
      "predicted: ['ke', 'ra', 'ni']\n",
      "\n",
      "original: mewahyukan\n",
      "actual: ['me', 'wah', 'yu', 'kan']\n",
      "predicted: ['me', 'wah', 'yu', 'kan']\n",
      "\n",
      "original: berprestij\n",
      "actual: ['ber', 'pres', 'tij']\n",
      "predicted: ['ber', 'pres', 'tij']\n",
      "\n",
      "original: dingin\n",
      "actual: ['di', 'ngin']\n",
      "predicted: ['di', 'ngin']\n",
      "\n",
      "original: lipas\n",
      "actual: ['li', 'pas']\n",
      "predicted: ['li', 'pas']\n",
      "\n",
      "original: berdingkit-dingkit\n",
      "actual: ['ber', 'ding', 'kit', 'ding', 'kit']\n",
      "predicted: ['ber', 'ding', 'kit', 'ding', 'kit']\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for test in test_set[:50]:\n",
    "    print('original:', test[0])\n",
    "    print('actual:', test[1].split('.'))\n",
    "    t = model.tokenize(test[0])\n",
    "    print('predicted:', t)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
